package tools.crawler.scenic.cthy.com;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

import com.easywan.util.StringsUtils;

import tools.crawler.utils.HttpRequest;

/**
 * Crawler for the scenic-spot search pages of scenic.cthy.com.
 *
 * <p>Only the second step of the planned pipeline is implemented: for a
 * registered city, the search page is fetched, the section between two HTML
 * markers is cut out, links are extracted from it and the filtered list is
 * printed to standard output.
 */
public class Cthy {
	/** Maps a city name to the URL of its scenic-spot search page. */
	private static final Map<String,String> citys = new HashMap<>();
	static {
		// Populated once at class initialization instead of lazily (and without
		// synchronization) inside the instance constructor.
		citys.put("深圳", "http://scenic.cthy.com/scenicSearch/44-0-0-0-0-1.html");
	}

	/** Extracted links are kept only when they are strictly shorter than this. */
	private static final int MAX_LINK_LENGTH = 60;

	/** Creates a crawler; the city table is already initialized statically. */
	public Cthy(){
	}

	/** Manual smoke test: runs the crawl for the one registered city (requires network access). */
	@Test
	public void test(){
		new Cthy().start("深圳");
	}

	/**
	 * Fetches the search page of the given city, extracts the links found in
	 * its listing section and prints the filtered list to standard output.
	 *
	 * @param name city name; must be a key of the internal city table
	 * @throws IllegalArgumentException if no search page is registered for {@code name}
	 * @throws IllegalStateException if the fetched page does not contain the expected section markers
	 */
	public void start(String name){
		String cityUrl = citys.get(name);
		if(cityUrl == null){
			// Fail with a clear message rather than passing a null URL to the HTTP client.
			throw new IllegalArgumentException("No search page registered for city: " + name);
		}
		//1. Load the home page (currently disabled)
	/*	String indeUrl = "http://scenic.cthy.com/scenicSearch/0-0-201-0-0-1.html";
		String start = "<div class=\"china\">",end = "<div class=\"topic\">";
		String hrefRegex = "<a href=\"(.+?)\">[\u4E00-\u9FFF]*</a>";
		
		List<String> urlMap = get(indeUrl,start,end,hrefRegex);
		*/
		//2. Analyze the city
		// NOTE(review): the literal 000 is presumably a placeholder that
		// StringsUtils.mathes2 interprets as the part to capture - TODO confirm.
		String hrefRegex = "<a href=\'000\' target=\'_blank\'";
		List<String> cityMap = get(cityUrl,"<div class=\"sightlist\">",
				"<div id=\"pagelist\">",hrefRegex
				);
		// Walk backwards so that removing an element never shifts the
		// entries that are still to be visited.
		for(int i = cityMap.size() - 1 ; i >= 0 ; i--){
			String link = cityMap.get(i);
			boolean keep = link.startsWith("http") && link.length() < MAX_LINK_LENGTH;
			if(!keep){
				cityMap.remove(i);
			}
		}
		System.out.println(cityMap);
		//3. Analyze sub-cities
		//4. Fetch article links
		//5. Fetch article content
	}

	/**
	 * Fetches a page and extracts matches from the part of its body that lies
	 * between the first occurrence of {@code start} and the last occurrence of
	 * {@code end}.
	 *
	 * @param indexUrl URL of the page to fetch
	 * @param start marker after whose first occurrence the section begins
	 * @param end marker at whose last occurrence the section ends
	 * @param hrefRegex pattern handed unchanged to {@code StringsUtils.mathes2}
	 * @return whatever {@code StringsUtils.mathes2} returns for the section
	 * @throws IllegalStateException if {@code start} is absent, or {@code end}
	 *         is absent or located before the end of the start marker
	 */
	private List<String> get(String indexUrl,String start,String end,String hrefRegex) {
		String html = HttpRequest.get(indexUrl).body();

		int startAt = html.indexOf(start);
		if(startAt < 0){
			// Previously a missing marker silently produced a mis-sliced section.
			throw new IllegalStateException("Start marker not found in " + indexUrl + ": " + start);
		}
		int from = startAt + start.length();
		int to = html.lastIndexOf(end);
		if(to < from){
			// Covers both "not found" (-1) and "found before the section start".
			throw new IllegalStateException("End marker not found after start marker in " + indexUrl + ": " + end);
		}
		return StringsUtils.mathes2(html.substring(from, to),hrefRegex);
	}
}
